Using generators to get numpy chunks out of TAQ data


In [1]:
from glob import glob
import raw_taq
import pandas as pd

In [2]:
import numpy as np
from statistics import mode, StatisticsError  # StatisticsError must be imported to be caught below

def print_stats(chunk):
    """Print summary statistics of the Bid_Price column.

    Parameters
    ----------
    chunk : numpy structured/record array or pandas DataFrame
        Must support string indexing with a 'Bid_Price' field.

    Returns
    -------
    None — results are printed, not returned.
    """
    # Pull the column once instead of re-indexing for every statistic
    prices = chunk['Bid_Price']

    max_price = np.max(prices)
    min_price = np.min(prices)
    avg_price = np.mean(prices)

    # statistics.mode raises StatisticsError on empty input (and, before
    # Python 3.8, when there is no unique mode); fall back to NaN.
    # Bug fix: the original caught StatisticsError without importing it,
    # which would raise NameError on that path.
    try:
        mod_price = mode(prices)
    except StatisticsError:
        mod_price = np.nan

    sd_price = np.std(prices)

    print("Max bid price: ", max_price, "\n", "Min bid price: ", min_price, "\n",
          "Mean bid price: ", avg_price, "\n", "Mode bid price: ", mod_price, "\n",
          "Standard deviation bid price: ", sd_price)

In [7]:
# Re-run this cell after editing raw_taq.py to pick up the changes
import importlib
importlib.reload(raw_taq)


Out[7]:
<module 'raw_taq' from '/Users/dav/Projects/dlab-finance/pynbbo/raw_taq.py'>

Here, we grab whatever BBO file we can find


In [5]:
# Pick the first BBO archive that matches in local_data
bbo_archives = glob('../local_data/EQY_US_ALL_BBO_*.zip')
fname = bbo_archives[0]
test_run = raw_taq.TAQ2Chunks(fname)

In [6]:
# Build a generator that yields the TAQ records 20 rows at a time
chunk_gen = test_run.convert_taq(20)

In [26]:
# convert_taq is a generator function, so calling it returns a generator
type(chunk_gen)


Out[26]:
generator

In [7]:
# You can get one chunk this way — next() advances the generator one step
chunk = next(chunk_gen)
chunk[0]


Out[7]:
(b'P', b'A               ', 0.0, 0, 0.0, 0, b'R', b'P', b'P', 14, b'2', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391676960.901)

In [28]:
# If you want just the type — the structured dtype lists every quote field
chunk.dtype


Out[28]:
dtype([('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1'), ('Time', '<f8')])

In [65]:
# Numpy record arrays support string indexing to get columns (as 1-D arrays)
print(chunk['Bid_Price'])
print(chunk["Ask_Price"])


[  0.     0.     0.    41.9   54.07  57.43  56.07   0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     0.     0.  ]
[  0.    72.94  60.76  60.76  60.76  60.76  64.    63.29   0.    63.3    0.
  63.3    0.    63.29   0.    63.3    0.    63.3    0.    63.3 ]

In [66]:
# Numeric indexing gives a row (a single record tuple)
chunk[0]


Out[66]:
(b'P', b'A               ', 0.0, 0, 0.0, 0, b'R', b'P', b'P', 14, b'2', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391676960.901)

In [31]:
# And you can do both: column first, then row index
chunk['Bid_Price'][6]


Out[31]:
56.07

In [32]:
# Or row first, then field name — same value either way
chunk[6]['Bid_Price']


Out[32]:
56.07

You can also easily convert numpy record arrays to pandas dataframes


In [13]:
# pandas builds a DataFrame from a structured array; columns come from the dtype
chunk_df = pd.DataFrame(chunk)

In [14]:
# Ending the cell with the frame gives the rich HTML display
chunk_df


Out[14]:
Exchange Symbol Bid_Price Bid_Size Ask_Price Ask_Size Quote_Condition Bid_Exchange Ask_Exchange Sequence_Number ... Quote_Cancel_Correction Source_of_Quote Retail_Interest_Indicator_RPI Short_Sale_Restriction_Indicator LULD_BBO_Indicator_CQS LULD_BBO_Indicator_UTP FINRA_ADF_MPID_Indicator SIP_generated_Message_Identifier National_BBO_LULD_Indicator Time
0 b'P' b'A ' 0.00 0 0.00 0 b'R' b'P' b'P' 14 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391677e+09
1 b'P' b'A ' 0.00 0 72.94 27 b'R' b'P' b'P' 76255 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
2 b'P' b'A ' 0.00 0 60.76 10 b'R' b'P' b'P' 76256 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
3 b'P' b'A ' 41.90 27 60.76 10 b'R' b'P' b'P' 76257 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
4 b'P' b'A ' 54.07 27 60.76 10 b'R' b'P' b'P' 76258 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
5 b'P' b'A ' 57.43 1 60.76 10 b'R' b'P' b'P' 78938 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
6 b'K' b'A ' 56.07 3 64.00 1 b'R' b'K' b'K' 81017 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
7 b'T' b'A ' 0.00 0 63.29 1 b'R' b'T' b'T' 81225 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
8 b'T' b'A ' 0.00 0 0.00 0 b'R' b'T' b'T' 81598 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
9 b'T' b'A ' 0.00 0 63.30 1 b'R' b'T' b'T' 81606 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09

10 rows × 22 columns


In [15]:
# Note that time is not correctly parsed yet: it is still a raw float (epoch-like), not a datetime
chunk_df.Time


Out[15]:
0    1.391677e+09
1    1.391691e+09
2    1.391691e+09
3    1.391691e+09
4    1.391691e+09
5    1.391691e+09
6    1.391691e+09
7    1.391691e+09
8    1.391691e+09
9    1.391691e+09
Name: Time, dtype: float64

Goal: Compute some summary statistics across a few securities in the TAQ file

Processing an entire TAQ file will take a long time. So, maybe just run through the chunks for the first two securities (you can then exit out of a loop once you see the third security / symbol).

A complete approach


In [10]:
# Recheck the record layout before writing the per-symbol loop below
chunk.dtype


Out[10]:
dtype([('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1'), ('Time', '<f8')])

In [8]:
fname = glob('../local_data/EQY_US_ALL_BBO_*.zip')[0]
local_taq = raw_taq.TAQ2Chunks(fname)

chunk_gen = local_taq.convert_taq(20)
first_chunk = next(chunk_gen)
# NOTE(review): the dtype printed earlier names this field 'Symbol', not
# 'Symbol_root' — presumably the reloaded raw_taq renamed it; confirm.
curr_symbol = first_chunk['Symbol_root'][0]

# Accumulate per-symbol records as a list of frames and concat once.
# Bug fix: DataFrame.append is NOT in-place (it returns a new frame, and was
# removed in pandas 2.0), so the original `accum.append(...)` calls silently
# discarded every chunk after the first.
pieces = [pd.DataFrame(first_chunk)]

processed_symbols = 0

for chunk in chunk_gen:
    where_symbol = curr_symbol == chunk['Symbol_root']
    if where_symbol.all():
        # Entire chunk is still the current symbol — keep accumulating
        pieces.append(pd.DataFrame(chunk))
    else:
        # Chunk straddles a symbol boundary: take the tail of the current symbol
        pieces.append(pd.DataFrame(chunk[where_symbol]))
        accum = pd.concat(pieces, ignore_index=True)

        # Compute the stats. Bug fix: the original printed len(curr_symbol),
        # which is the 16-byte width of the symbol field, not the record count.
        print('Current symbol:', curr_symbol, len(accum), 'records')
        print_stats(accum)
        processed_symbols += 1
        if processed_symbols > 3:
            break

        # Start accumulating the next symbol from the rest of this chunk
        diff = chunk[~where_symbol]
        pieces = [pd.DataFrame(diff)]
        curr_symbol = diff['Symbol_root'][0]

In [9]:
# bytes objects compare by value, so raw (space-padded) symbol fields compare correctly
b'AA              ' == b'AA              '


Out[9]:
True

Some simple examples of how generator functions work


In [16]:
def simple_fun(l):
    """Yield each element of iterable ``l`` in order (a minimal generator)."""
    yield from l

In [17]:
# Calling a generator function does not run its body — it returns a generator object
simple_gen = simple_fun(['a', 'b', 1, 2])

In [18]:
# Same type as chunk_gen above: both are generators
type(simple_gen)


Out[18]:
generator

In [19]:
# next() resumes the generator until the next yield, returning the yielded value
next(simple_gen)


Out[19]:
'a'

In [20]:
# Generators plug directly into for loops (and any other iteration context)
for element in simple_fun(['a', 'b', 1, 2]):
    print(element)


a
b
1
2

In [ ]: